# -*- coding: utf-8 -*-
"""
此处功能用于数据分析
"""
import pandas as pd
from collections import Counter  # 用于标签统计
from settings import path

def train(file_path=None):
    """Summarize the label distribution and text lengths of a training file.

    The file is read as a tab-separated table without a header row; its two
    columns are named ``text`` and ``label``. The minimum text length is
    printed to stdout, and all computed statistics are returned so that
    callers can inspect them instead of editing print statements.

    Args:
        file_path: Path of the file to analyse. When ``None`` (the default),
            ``path.path_train_txt`` from the project settings is used.

    Returns:
        A dict with the keys:
            ``label_ratio``: mapping of each label to its share of all rows.
            ``text_mean``: mean of the per-row text lengths.
            ``text_std``: standard deviation of the text lengths, as
                computed by ``Series.std()``.
            ``text_max``: largest text length.
            ``text_min``: smallest text length.
    """
    if file_path is None:
        file_path = path.path_train_txt
    data = pd.read_csv(file_path, names=['text', 'label'], sep='\t')

    # Share of each label among all rows; the row count is loop-invariant,
    # so it is taken once up front.
    total_rows = len(data)
    label_ratio = {
        label: count / total_rows
        for label, count in Counter(data["label"]).items()
    }

    # Per-row character count of the text column.
    text_lengths = data['text'].str.len()
    stats = {
        'label_ratio': label_ratio,
        'text_mean': text_lengths.mean(),
        'text_std': text_lengths.std(),
        'text_max': text_lengths.max(),
        'text_min': text_lengths.min(),
    }

    # Preserve the original console output.
    print(stats['text_min'])
    return stats
